Scripts to run CAVIAR, FINEMAP (version 1.1) and DAP-G
# knitr::opts_chunk$set(error = TRUE) # Knit even with errors
# devtools::install_local("geneRefineR/", force = T)
# library("echolocatoR")
source("echolocatoR/R/MAIN.R")
library(readxl)
library(DT)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(cowplot)
library(ggrepel)
library(curl)
library(biomaRt)
# Ensembl LD API
# library(httr)
# library(jsonlite)
# library(xml2)
# library(gaston)
# library(RCurl)
# *** susieR ****
# library(knitrBootstrap) #install_github('jimhester/knitrBootstrap')
library(susieR) # devtools::install_github("stephenslab/susieR")
sessionInfo()## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] susieR_0.6.2.0411 biomaRt_2.38.0 tidyr_0.8.3
## [4] gaston_1.5.5 RcppParallel_4.4.2 Rcpp_1.0.1
## [7] curl_3.3 ggrepel_0.8.0 cowplot_0.9.4
## [10] plotly_4.9.0 ggplot2_3.1.1 dplyr_0.8.0.1
## [13] data.table_1.12.2 DT_0.5.2 readxl_1.3.1
##
## loaded via a namespace (and not attached):
## [1] lattice_0.20-38 prettyunits_1.0.2 assertthat_0.2.1
## [4] digest_0.6.18 R6_2.4.0 cellranger_1.1.0
## [7] plyr_1.8.4 stats4_3.5.1 RSQLite_2.1.1
## [10] evaluate_0.13 httr_1.4.0 pillar_1.3.1
## [13] rlang_0.3.4 progress_1.2.0 lazyeval_0.2.2
## [16] blob_1.1.1 S4Vectors_0.20.1 Matrix_1.2-17
## [19] rmarkdown_1.12 stringr_1.4.0 htmlwidgets_1.3
## [22] RCurl_1.95-4.12 bit_1.1-14 munsell_0.5.0
## [25] compiler_3.5.1 xfun_0.6 pkgconfig_2.0.2
## [28] BiocGenerics_0.28.0 htmltools_0.3.6 tidyselect_0.2.5
## [31] tibble_2.1.1 expm_0.999-4 IRanges_2.16.0
## [34] XML_3.98-1.19 viridisLite_0.3.0 crayon_1.3.4
## [37] withr_2.1.2 bitops_1.0-6 grid_3.5.1
## [40] jsonlite_1.6 gtable_0.3.0 DBI_1.0.0
## [43] magrittr_1.5 scales_1.0.0 stringi_1.4.3
## [46] tools_3.5.1 bit64_0.9-7 Biobase_2.42.0
## [49] glue_1.3.1 purrr_0.3.2 hms_0.4.2
## [52] parallel_3.5.1 yaml_2.2.0 AnnotationDbi_1.44.0
## [55] colorspace_1.4-1 memoise_1.1.0 knitr_1.22
print(paste("susieR ", packageVersion("susieR"))) ## [1] "susieR 0.6.2.411"
# MINERVA PATHS TO SUMMARY STATISTICS DATASETS:
Data_dirs <- read.csv("./Data/directories_table.csv")
# Direct to local data files if not working on Minerva
if(getwd()=="/Users/schilder/Desktop/Fine_Mapping"){
vcf_folder = F # Use internet if I'm working from my laptop
} else{
vcf_folder = F#"../1000_Genomes_VCFs/Phase1" # Use previously downloaded files if working on Minerva node
}
force_new_subset = F
allResults <- list()“Rare coding variant burden analysis:
We performed kernel-based burden tests on the 113 genes in our PD GWAS loci that contained two or more rare coding variants (MAF< 5% or MAF < 1%). After Bonferroni correction for 113 genes, we identified 7 significant putatively causal genes: - LRRK2, GBA, CATSPER3 (rs11950533/C5orf24 locus) - LAMB2 (rs12497850/IP6K2 locus) - LOC442028 (rs2042477/KCNIP3 locus) - NFKB2 (rs10748818/GBF1 locus), and SCARB2 (rs6825004 locus).”
– from Nalls et al. (2019)
dataset_name <- "Nalls23andMe_2019"
top_SNPs <- Nalls_top_SNPs <- import_topSNPs(
file_path = Directory_info(Data_dirs, dataset_name, "topSumStats"),
chrom_col = "CHR", position_col = "BP", snp_col="SNP",
pval_col="P, all studies", effect_col="Beta, all studies", gene_col="Nearest Gene",
caption= "Nalls et al. (2018) w/ 23andMe PD GWAS Summary Stats")
# Manually add new SNP of interest
top_SNPs <- rbind(top_SNPs,
data.table::data.table(Coord="14:54893485", CHR=14, POS=54893485, SNP="rs3783642", P=NA, Effect=NA, Gene="ATG14"))
gene_list <- c("LRRK2", "TMEM163", "GPNMB", "CTSB", "ATG14")# "LRRK2_alt","SNCA","VPS13C","C5orf24","IP6K2","KCNIP3","GBF1","SCARB2")#, top_SNPs$Gene) %>% unique()
allResults[[dataset_name]] <- finemap_geneList(top_SNPs,
geneList=gene_list,
dataset_name = dataset_name,
dataset_type = "GWAS",
query_by ="coordinates",
subset_path = "auto",
finemap_method = "susie",
# file_path = Directory_info(dataset_name, "fullSumStats"),
fullSS_path = "Data/GWAS/Nalls23andMe_2019/nallsEtAl2019_allSamples_allVariants.mod.txt",
chrom_col = "CHR", position_col = "POS", snp_col = "RSID",
pval_col = "p", effect_col = "beta", stderr_col = "se", MAF_col = "MAF",
vcf_folder = vcf_folder, superpopulation = "EUR", force_new_subset = F,
LD_block = F, block_size = .7,
remove_variants=c("rs34637584"),
num_causal = 1)Results path =
Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/LRRK2_subset.vcf
—— Importing VCF as bed file… ——
++++++++++ Reading in BIM file… ++++++++++
++++++++++ Previously computed LD Matrix identified. Importing… ++++++++++
++++++++++ LD matrix calculated in 1.16 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found
Fine-mapping with SusieR… [1] “objective:-3757.22430938018” [1] “objective:-3754.9881063744” [1] “objective:-3754.98437444501” [1] “objective:-3754.98436821551” [1] “objective:-3754.98436820552” [1] “objective:-3754.98436820551”
Identifying Credible Set…
****** 6 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 11 seconds.
LRRK2 fine-mapped in 18.26 seconds
Results path =
Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/TMEM163_subset.vcf
—— Importing VCF as bed file… ——
++++++++++ Reading in BIM file… ++++++++++
++++++++++ Calculating LD ++++++++++
++++++++++ LD matrix calculated in 0.43 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found
Fine-mapping with SusieR… [1] “objective:-2653.78595588928” [1] “objective:-2653.37289639925” [1] “objective:-2653.37264152331” [1] “objective:-2653.3726413677”
Identifying Credible Set…
****** 1 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 4 seconds.
TMEM163 fine-mapped in 9.97 seconds
Results path =
Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/GPNMB_subset.vcf
—— Importing VCF as bed file… ——
++++++++++ Reading in BIM file… ++++++++++
++++++++++ Calculating LD ++++++++++
++++++++++ LD matrix calculated in 0.62 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found
Fine-mapping with SusieR… [1] “objective:-3671.85545167874” [1] “objective:-3670.77496658198” [1] “objective:-3670.77321466314” [1] “objective:-3670.77321184107” [1] “objective:-3670.77321183669” [1] “objective:-3670.77321183668”
Identifying Credible Set…
****** 33 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 8 seconds.
GPNMB fine-mapped in 14.62 seconds
Results path =
Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/CTSB_subset.vcf
—— Importing VCF as bed file… ——
++++++++++ Reading in BIM file… ++++++++++
++++++++++ Calculating LD ++++++++++
++++++++++ LD matrix calculated in 0.83 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found
Fine-mapping with SusieR… [1] “objective:-4457.84901816636” [1] “objective:-4457.53258739532” [1] “objective:-4457.5323407588” [1] “objective:-4457.53234056553”
Identifying Credible Set…
****** 30 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 15 seconds.
CTSB fine-mapped in 21.85 seconds
Results path =
Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/ATG14_subset.vcf
—— Importing VCF as bed file… ——
++++++++++ Reading in BIM file… ++++++++++
++++++++++ Calculating LD ++++++++++
++++++++++ LD matrix calculated in 0.74 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found
Fine-mapping with SusieR… [1] “objective:-3469.47929863011” [1] “objective:-3469.05590716079” [1] “objective:-3469.0550645008” [1] “objective:-3469.05506281449”
Identifying Credible Set…
****** 11 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 8 seconds.
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
ATG14 fine-mapped in 14.65 seconds
cred_set <- subset(allResults[[dataset_name]], credible_set==T & Gene=="LRRK2")
manhattan_plot(# subset_path= get_subset_path(results_path = Directory_info(Data_dirs, dataset_name, "fullSumStats"), gene="LRRK2"),
subset_path="./Data/GWAS/Nalls23andMe_2019/LRRK2/LRRK2_Nalls23andMe_2019_subset.txt",
gene="LRRK2", SNP_list = c("rs76904798","rs34637584","rs117073808"), alt_color_SNPs=c("rs34637584"))dataset_name <- "MESA_AFA"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),
superpopulation = "AFA",
dataset_name = dataset_name,
dataset_type = "eQTL",
top_SNPs = Nalls_top_SNPs,
query_by = "gene",
# fullSS_path = Directory_info(dataset_name, "fullSumStats"),
fullSS_path = "Data/eQTL/MESA_AFA/LRRK2_AFA_MESA.txt",
subset_path = "auto",
chrom_col = "chr", position_col = "pos_snps", snp_col="snps",
pval_col="pvalue", effect_col="beta", gene_col="gene_name",
stderr_col = "calculate", tstat_col = "statistic",
vcf_folder = vcf_folder, num_causal = 1,
force_new_subset = F,
LD_block = F)Results path =
Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/LRRK2_subset.vcf
—— Importing VCF as bed file… ——
++++++++++ Reading in BIM file… ++++++++++
++++++++++ Calculating LD ++++++++++
++++++++++ LD matrix calculated in 1.53 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found
Fine-mapping with SusieR… [1] “objective:-5763.20635450473” [1] “objective:-5763.20613841342” [1] “objective:-5763.20613798223” [1] “objective:-5763.20613798137”
Identifying Credible Set…
— ERROR — ****** Could NOT identify credible set. Default to SNPs with the top 5 PIPs ****** + Fine-mapping with ‘susie’ completed in 30 seconds.
LRRK2 fine-mapped in 36.89 seconds
dataset_name <- "MESA_CAU"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),
superpopulation = "CAU",
dataset_name = dataset_name,
dataset_type = "eQTL",
top_SNPs = Nalls_top_SNPs,
query_by = "gene",
# fullSS_path = Directory_info(dataset_name, "fullSumStats"),
fullSS_path = "Data/eQTL/MESA_CAU/LRRK2_CAU_MESA.txt",
subset_path = "auto",
chrom_col = "chr", position_col = "pos_snps", snp_col="snps",
pval_col="pvalue", effect_col="beta", gene_col="gene_name",
stderr_col = "calculate", tstat_col = "statistic",
vcf_folder = vcf_folder, num_causal = 1,
force_new_subset = F,
LD_block = F)Results path =
Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/LRRK2_subset.vcf
—— Importing VCF as bed file… ——
++++++++++ Reading in BIM file… ++++++++++
++++++++++ Calculating LD ++++++++++
++++++++++ LD matrix calculated in 0.93 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found
Fine-mapping with SusieR… [1] “objective:-3771.01872348208” [1] “objective:-3770.81266495247” [1] “objective:-3770.81212407217” [1] “objective:-3770.81212264564”
Identifying Credible Set…
— ERROR — ****** Could NOT identify credible set. Default to SNPs with the top 5 PIPs ****** + Fine-mapping with ‘susie’ completed in 9 seconds.
LRRK2 fine-mapped in 15.73 seconds
Used the Ad Mixed American (AMR) reference panel given the absence of a Hispanic reference panel for 1KG_Phase1.
dataset_name <- "MESA_HIS"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),
superpopulation = "HIS",
dataset_name = dataset_name,
dataset_type = "eQTL",
top_SNPs = Nalls_top_SNPs,
query_by = "gene",
# fullSS_path = Directory_info(dataset_name, "fullSumStats"),
fullSS_path = "Data/eQTL/MESA_HIS/LRRK2_HIS_MESA.txt",
subset_path = "auto",
chrom_col = "chr", position_col = "pos_snps", snp_col="snps",
pval_col="pvalue", effect_col="beta", gene_col="gene_name",
stderr_col = "calculate", tstat_col = "statistic",
vcf_folder = vcf_folder, num_causal = 1,
force_new_subset = F,
LD_block = F)Results path =
Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/LRRK2_subset.vcf
—— Importing VCF as bed file… ——
++++++++++ Reading in BIM file… ++++++++++
++++++++++ Calculating LD ++++++++++
++++++++++ LD matrix calculated in 0.96 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found
Fine-mapping with SusieR… [1] “objective:-4704.09026794048” [1] “objective:-4703.84677509606” [1] “objective:-4703.8464861313” [1] “objective:-4703.84648578904”
Identifying Credible Set…
****** 19 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 16 seconds.
LRRK2 fine-mapped in 22.12 seconds
# +++++++++++++++ CD14 Stimulation +++++++++++++++ #
dataset_name <- "Fairfax_2014_CD14"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),
superpopulation = "CAU",
dataset_name = dataset_name,
dataset_type = "eQTL",
top_SNPs = Nalls_top_SNPs,
query_by = "probes",
probe_path = "./Data/eQTL/gene.ILMN.map",
file_sep = " ",
# fullSS_path = Directory_info(dataset_name, "fullSumStats"),
fullSS_path = "./Data/eQTL/Fairfax_2014_CD14/LRRK2_Fairfax_CD14.txt",
subset_path = "auto",
chrom_col = "CHR", position_col = "POS", snp_col="SNP",
pval_col="p-value", effect_col="beta", gene_col="gene",
stderr_col = "calculate", tstat_col = "t-stat",
vcf_folder = vcf_folder, num_causal = 1,
force_new_subset = force_new_subset,
LD_block = T)
# +++++++++++++++ IFN Stimulation +++++++++++++++ #
dataset_name <- "Fairfax_2014_IFN"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"), superpopulation = "CAU",
top_SNPs = Nalls_top_SNPs, query_by = "coordinates_merged",
file_sep = " ",
fullSS_path = Directory_info(dataset_name, "fullSumStats"),
subset_path = "Data/eQTL/Fairfax/LRRK2_Fairfax_IFN_subset.txt",
chrom_col = "SNP", position_col = "SNP", snp_col="SNP",
pval_col="p-value", effect_col="beta", gene_col="gene",
stderr_col = "calculate", tstat_col = "t-stat",
vcf_folder = vcf_folder, num_causal = 1,
force_new_subset = force_new_subset,
LD_block = T)
# +++++++++++++++ LPS Stimulation (after 2 hours) +++++++++++++++ #
dataset_name <- "Fairfax_2014_LPS2"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"), superpopulation = "CAU",
top_SNPs = Nalls_top_SNPs,
# query_by = "probes", # Use if querying the main file
query_by = "coordinates_merged",
file_sep = " ",
fullSS_path = Directory_info(dataset_name, "fullSumStats"),
subset_path = "Data/eQTL/Fairfax/LRRK2_Fairfax_LPS2_subset.txt",
chrom_col = "SNP", position_col = "SNP", snp_col="SNP",
pval_col="p-value", effect_col="beta", gene_col="gene",
stderr_col = "calculate", tstat_col = "t-stat",
vcf_folder = vcf_folder, num_causal = 1,
force_new_subset = force_new_subset,
LD_block = T)
# +++++++++++++++ LPS Stimulation (after 24 hours) +++++++++++++++ #
dataset_name <- "Fairfax_2014_LPS24"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"), superpopulation = "CAU",
top_SNPs = Nalls_top_SNPs, query_by = "coordinates_merged",
file_sep = " ",
fullSS_path = Directory_info(dataset_name, "fullSumStats"),
subset_path = "Data/eQTL/Fairfax/LRRK2_Fairfax_LPS24_subset.txt",
chrom_col = "SNP", position_col = "SNP", snp_col="SNP",
pval_col="p-value", effect_col="beta", gene_col="gene",
stderr_col = "calculate", tstat_col = "t-stat",
vcf_folder = vcf_folder, num_causal = 1,
force_new_subset = force_new_subset,
LD_block = T)merged_results <- merge_finemapping_results(allResults, csv_path ="Data/merged_finemapping_results.csv")
createDT(merged_results, caption = "Finemapped Credible Sets for Multiple Datasets")# table(as.character(merged_results$SNP))